fastq filesThis step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.
suppressMessages(
c(library(DESeq2),
library(limma),
library(tximport),
library(gridExtra),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(VennDiagram),
library(clusterProfiler),
library(DOSE),
library(org.Mm.eg.db),
library(pathview),
library(AnnotationDbi),
library(knitr))
)
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "matrixStats" "Biobase" "GenomicRanges"
## [7] "GenomeInfoDb" "IRanges" "S4Vectors"
## [10] "BiocGenerics" "parallel" "stats4"
## [13] "stats" "graphics" "grDevices"
## [16] "utils" "datasets" "methods"
## [19] "base" "limma" "DESeq2"
## [22] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [25] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [28] "IRanges" "S4Vectors" "BiocGenerics"
## [31] "parallel" "stats4" "stats"
## [34] "graphics" "grDevices" "utils"
## [37] "datasets" "methods" "base"
## [40] "tximport" "limma" "DESeq2"
## [43] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [46] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [49] "IRanges" "S4Vectors" "BiocGenerics"
## [52] "parallel" "stats4" "stats"
## [55] "graphics" "grDevices" "utils"
## [58] "datasets" "methods" "base"
## [61] "gridExtra" "tximport" "limma"
## [64] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [67] "matrixStats" "Biobase" "GenomicRanges"
## [70] "GenomeInfoDb" "IRanges" "S4Vectors"
## [73] "BiocGenerics" "parallel" "stats4"
## [76] "stats" "graphics" "grDevices"
## [79] "utils" "datasets" "methods"
## [82] "base" "ensembldb" "AnnotationFilter"
## [85] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [88] "tximport" "limma" "DESeq2"
## [91] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [94] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [97] "IRanges" "S4Vectors" "BiocGenerics"
## [100] "parallel" "stats4" "stats"
## [103] "graphics" "grDevices" "utils"
## [106] "datasets" "methods" "base"
## [109] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [112] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [115] "tximport" "limma" "DESeq2"
## [118] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [121] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [124] "IRanges" "S4Vectors" "BiocGenerics"
## [127] "parallel" "stats4" "stats"
## [130] "graphics" "grDevices" "utils"
## [133] "datasets" "methods" "base"
## [136] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [139] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [142] "gridExtra" "tximport" "limma"
## [145] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [148] "matrixStats" "Biobase" "GenomicRanges"
## [151] "GenomeInfoDb" "IRanges" "S4Vectors"
## [154] "BiocGenerics" "parallel" "stats4"
## [157] "stats" "graphics" "grDevices"
## [160] "utils" "datasets" "methods"
## [163] "base" "ggplot2" "grid"
## [166] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [169] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [172] "tximport" "limma" "DESeq2"
## [175] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [178] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [181] "IRanges" "S4Vectors" "BiocGenerics"
## [184] "parallel" "stats4" "stats"
## [187] "graphics" "grDevices" "utils"
## [190] "datasets" "methods" "base"
## [193] "lattice" "ggplot2" "grid"
## [196] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [199] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [202] "tximport" "limma" "DESeq2"
## [205] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [208] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [211] "IRanges" "S4Vectors" "BiocGenerics"
## [214] "parallel" "stats4" "stats"
## [217] "graphics" "grDevices" "utils"
## [220] "datasets" "methods" "base"
## [223] "reshape" "lattice" "ggplot2"
## [226] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [229] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [232] "gridExtra" "tximport" "limma"
## [235] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [238] "matrixStats" "Biobase" "GenomicRanges"
## [241] "GenomeInfoDb" "IRanges" "S4Vectors"
## [244] "BiocGenerics" "parallel" "stats4"
## [247] "stats" "graphics" "grDevices"
## [250] "utils" "datasets" "methods"
## [253] "base" "mixOmics" "MASS"
## [256] "reshape" "lattice" "ggplot2"
## [259] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [262] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [265] "gridExtra" "tximport" "limma"
## [268] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [271] "matrixStats" "Biobase" "GenomicRanges"
## [274] "GenomeInfoDb" "IRanges" "S4Vectors"
## [277] "BiocGenerics" "parallel" "stats4"
## [280] "stats" "graphics" "grDevices"
## [283] "utils" "datasets" "methods"
## [286] "base" "gplots" "mixOmics"
## [289] "MASS" "reshape" "lattice"
## [292] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [295] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [298] "AnnotationDbi" "gridExtra" "tximport"
## [301] "limma" "DESeq2" "SummarizedExperiment"
## [304] "DelayedArray" "matrixStats" "Biobase"
## [307] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [310] "S4Vectors" "BiocGenerics" "parallel"
## [313] "stats4" "stats" "graphics"
## [316] "grDevices" "utils" "datasets"
## [319] "methods" "base" "RColorBrewer"
## [322] "gplots" "mixOmics" "MASS"
## [325] "reshape" "lattice" "ggplot2"
## [328] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [331] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [334] "gridExtra" "tximport" "limma"
## [337] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [340] "matrixStats" "Biobase" "GenomicRanges"
## [343] "GenomeInfoDb" "IRanges" "S4Vectors"
## [346] "BiocGenerics" "parallel" "stats4"
## [349] "stats" "graphics" "grDevices"
## [352] "utils" "datasets" "methods"
## [355] "base" "readr" "RColorBrewer"
## [358] "gplots" "mixOmics" "MASS"
## [361] "reshape" "lattice" "ggplot2"
## [364] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [367] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [370] "gridExtra" "tximport" "limma"
## [373] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [376] "matrixStats" "Biobase" "GenomicRanges"
## [379] "GenomeInfoDb" "IRanges" "S4Vectors"
## [382] "BiocGenerics" "parallel" "stats4"
## [385] "stats" "graphics" "grDevices"
## [388] "utils" "datasets" "methods"
## [391] "base" "dplyr" "readr"
## [394] "RColorBrewer" "gplots" "mixOmics"
## [397] "MASS" "reshape" "lattice"
## [400] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [403] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [406] "AnnotationDbi" "gridExtra" "tximport"
## [409] "limma" "DESeq2" "SummarizedExperiment"
## [412] "DelayedArray" "matrixStats" "Biobase"
## [415] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [418] "S4Vectors" "BiocGenerics" "parallel"
## [421] "stats4" "stats" "graphics"
## [424] "grDevices" "utils" "datasets"
## [427] "methods" "base" "VennDiagram"
## [430] "futile.logger" "dplyr" "readr"
## [433] "RColorBrewer" "gplots" "mixOmics"
## [436] "MASS" "reshape" "lattice"
## [439] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [442] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [445] "AnnotationDbi" "gridExtra" "tximport"
## [448] "limma" "DESeq2" "SummarizedExperiment"
## [451] "DelayedArray" "matrixStats" "Biobase"
## [454] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [457] "S4Vectors" "BiocGenerics" "parallel"
## [460] "stats4" "stats" "graphics"
## [463] "grDevices" "utils" "datasets"
## [466] "methods" "base" "clusterProfiler"
## [469] "VennDiagram" "futile.logger" "dplyr"
## [472] "readr" "RColorBrewer" "gplots"
## [475] "mixOmics" "MASS" "reshape"
## [478] "lattice" "ggplot2" "grid"
## [481] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [484] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [487] "tximport" "limma" "DESeq2"
## [490] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [493] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [496] "IRanges" "S4Vectors" "BiocGenerics"
## [499] "parallel" "stats4" "stats"
## [502] "graphics" "grDevices" "utils"
## [505] "datasets" "methods" "base"
## [508] "DOSE" "clusterProfiler" "VennDiagram"
## [511] "futile.logger" "dplyr" "readr"
## [514] "RColorBrewer" "gplots" "mixOmics"
## [517] "MASS" "reshape" "lattice"
## [520] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [523] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [526] "AnnotationDbi" "gridExtra" "tximport"
## [529] "limma" "DESeq2" "SummarizedExperiment"
## [532] "DelayedArray" "matrixStats" "Biobase"
## [535] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [538] "S4Vectors" "BiocGenerics" "parallel"
## [541] "stats4" "stats" "graphics"
## [544] "grDevices" "utils" "datasets"
## [547] "methods" "base" "org.Mm.eg.db"
## [550] "DOSE" "clusterProfiler" "VennDiagram"
## [553] "futile.logger" "dplyr" "readr"
## [556] "RColorBrewer" "gplots" "mixOmics"
## [559] "MASS" "reshape" "lattice"
## [562] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [565] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [568] "AnnotationDbi" "gridExtra" "tximport"
## [571] "limma" "DESeq2" "SummarizedExperiment"
## [574] "DelayedArray" "matrixStats" "Biobase"
## [577] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [580] "S4Vectors" "BiocGenerics" "parallel"
## [583] "stats4" "stats" "graphics"
## [586] "grDevices" "utils" "datasets"
## [589] "methods" "base" "pathview"
## [592] "org.Hs.eg.db" "org.Mm.eg.db" "DOSE"
## [595] "clusterProfiler" "VennDiagram" "futile.logger"
## [598] "dplyr" "readr" "RColorBrewer"
## [601] "gplots" "mixOmics" "MASS"
## [604] "reshape" "lattice" "ggplot2"
## [607] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [610] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [613] "gridExtra" "tximport" "limma"
## [616] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [619] "matrixStats" "Biobase" "GenomicRanges"
## [622] "GenomeInfoDb" "IRanges" "S4Vectors"
## [625] "BiocGenerics" "parallel" "stats4"
## [628] "stats" "graphics" "grDevices"
## [631] "utils" "datasets" "methods"
## [634] "base" "pathview" "org.Hs.eg.db"
## [637] "org.Mm.eg.db" "DOSE" "clusterProfiler"
## [640] "VennDiagram" "futile.logger" "dplyr"
## [643] "readr" "RColorBrewer" "gplots"
## [646] "mixOmics" "MASS" "reshape"
## [649] "lattice" "ggplot2" "grid"
## [652] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [655] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [658] "tximport" "limma" "DESeq2"
## [661] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [664] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [667] "IRanges" "S4Vectors" "BiocGenerics"
## [670] "parallel" "stats4" "stats"
## [673] "graphics" "grDevices" "utils"
## [676] "datasets" "methods" "base"
## [679] "knitr" "pathview" "org.Hs.eg.db"
## [682] "org.Mm.eg.db" "DOSE" "clusterProfiler"
## [685] "VennDiagram" "futile.logger" "dplyr"
## [688] "readr" "RColorBrewer" "gplots"
## [691] "mixOmics" "MASS" "reshape"
## [694] "lattice" "ggplot2" "grid"
## [697] "EnsDb.Mmusculus.v79" "ensembldb" "AnnotationFilter"
## [700] "GenomicFeatures" "AnnotationDbi" "gridExtra"
## [703] "tximport" "limma" "DESeq2"
## [706] "SummarizedExperiment" "DelayedArray" "matrixStats"
## [709] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [712] "IRanges" "S4Vectors" "BiocGenerics"
## [715] "parallel" "stats4" "stats"
## [718] "graphics" "grDevices" "utils"
## [721] "datasets" "methods" "base"
Set working directory to the folder that contains only gene count sf files
# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$gene_id))
# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Gene Counts/TCT")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
## [1] "I_MKI1_3963_LIB037245_GEN00137855_R1.sf"
## [2] "I_MKI2_1711_LIB037245_GEN00137858_R1.sf"
## [3] "I_MKI3_3504_LIB041682_GEN00156317_R1.sf"
## [4] "I_MKI4_3501_LIB041682_GEN00156318_R1.sf"
## [5] "I_MKI5_3503_LIB041682_GEN00156319_R1.sf"
## [6] "I_MKI6_LS78_LIB044271_GEN00169444_R1.sf"
## [7] "I_V1_3999_LIB041682_GEN00156316_R1.sf"
## [8] "I_V2_KL3687_LIB044271_GEN00169448_R1.sf"
## [9] "I_V3_KL3693_LIB044271_GEN00169449_R1.sf"
## [10] "I_V4_KL3823_LIB044271_GEN00169451_R1.sf"
## [11] "I_V5_KL3912_LIB044271_GEN00169453_R1.sf"
## [12] "I_V6_LS79_LIB044271_GEN00169445_R1.sf"
## [13] "NI_MKI1_3976_LIB037245_GEN00137847_R1.sf"
## [14] "NI_MKI2_3960_LIB037245_GEN00137848_R1.sf"
## [15] "NI_MKI3_3561_LIB037245_GEN00137849_R1.sf"
## [16] "NI_MKI4_3462_LIB037245_GEN00137850_R1.sf"
## [17] "NI_MKI5_3505_LIB041682_GEN00156321_R1.sf"
## [18] "NI_MKI6_KL3695_LIB044271_GEN00169450_R1.sf"
## [19] "NI_V1_3974_LIB037245_GEN00137843_R1.sf"
## [20] "NI_V2_3975_LIB037245_GEN00137844_R1.sf"
## [21] "NI_V3_3962_LIB037245_GEN00137845_R1.sf"
## [22] "NI_V4_3461_LIB037245_GEN00137846_R1.sf"
## [23] "NI_V5_3914_LIB041682_GEN00156320_R1.sf"
## [24] "NI_V6_KL3688_LIB044271_GEN00169446_R1.sf"
## [25] "NI_V7_KL3832_LIB044271_GEN00169452_R1.sf"
names(countFiles) <- c('I_MKI_1','I_MKI_2','I_MKI_3','I_MKI_4','I_MKI_5','I_MKI_6', 'I_V_1', 'I_V_2', 'I_V_3', 'I_V_4', 'I_V_5', 'I_V_6', 'NI_MKI_1','NI_MKI_2','NI_MKI_3','NI_MKI_4', 'NI_MKI_5', 'NI_MKI_6', 'NI_V_1', 'NI_V_2', 'NI_V_3', 'NI_V_4', 'NI_V_5', 'NI_V_6', 'NI_V_7')
countFiles_sum <- countFiles[-c(21,23)]
txi.salmon <- tximport(countFiles_sum, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all <- data.frame(condition = c(rep('experimental', 12), rep('control', 13)))
DESeqsampletable_all$batch <- factor(c('1','1','3','3','3','4','3','4','4','4','4','4','1','1','1','1','3','4','1','1','1','1','3','4','4'))
DESeqsampletable_all$gender <- factor(c('F','F','M','F','M','F','F','M','F','M','F','M','F','F','F','M','M','M','F','F','F','F','M','M','M'))
DESeqsampletable_sum <- DESeqsampletable_all[-c(21,23),]
rownames(DESeqsampletable_sum) <- colnames(txi.salmon$counts)
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable_sum, ~ batch + gender + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
Data is transformed and pseudocount is calculated.
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
ggplot(pseudoCount, aes(x= I_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_1"),
ggplot(pseudoCount, aes(x= I_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_2"),
ggplot(pseudoCount, aes(x= I_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_3"),
ggplot(pseudoCount, aes(x= I_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_4"),
ggplot(pseudoCount, aes(x= I_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_5"),
ggplot(pseudoCount, aes(x= I_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_MKI_6"),
nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= I_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_1"),
ggplot(pseudoCount, aes(x= I_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_2"),
ggplot(pseudoCount, aes(x= I_V_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_3"),
ggplot(pseudoCount, aes(x= I_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_4"),
ggplot(pseudoCount, aes(x= I_V_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_5"),
ggplot(pseudoCount, aes(x= I_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "I_V_6"),
nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= NI_MKI_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_1"),
ggplot(pseudoCount, aes(x= NI_MKI_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_2"),
ggplot(pseudoCount, aes(x= NI_MKI_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_3"),
ggplot(pseudoCount, aes(x= NI_MKI_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_4"),
ggplot(pseudoCount, aes(x= NI_MKI_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_5"),
ggplot(pseudoCount, aes(x= NI_MKI_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_MKI_6"),
nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= NI_V_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_1"),
ggplot(pseudoCount, aes(x= NI_V_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_2"),
ggplot(pseudoCount, aes(x= NI_V_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_4"),
ggplot(pseudoCount, aes(x= NI_V_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_6"),
ggplot(pseudoCount, aes(x= NI_V_7)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "NI_V_7"),
nrow = 2)
Check on the gene count distribution across all genes.
#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,4))
ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3", "#E69F00", "#FF0000")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab(expression(log[2](count+1)))
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:12] != 0) >= 2 | sum(rawCountTable[i,13:18] != 0) >= 2 | sum(rawCountTable[i,19:23] != 0) >= 2) {
keep <- c(keep, i)
}
}
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_TCT.csv")
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2) +
xlim(-50, 50) + ylim(-30, 45)
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing missing values (geom_text).
Set working directory to the folder that contains only gene count sf files
# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[19:25], countFiles[7:12])
countFiles_vcompare <- countFiles_vcompare[c(-3,-5)]
txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 7:12), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_V vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_I_V vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_I_V vs NI_V.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)
pdf("PCA_TCT_IV-vs-NIV.pdf",
height = 6,
width = 8)
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)
dev.off()
## quartz_off_screen
## 2
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected_iv_vs_niv <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected_iv_vs_niv, "VST_I_V vs NI_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_iv_vs_niv)))
}
sig_count <- rawCountTable_transform_detected_iv_vs_niv[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(128)
heatmap_matrix <- as.matrix(sig_dif[,7:17])
png('I_V vs NI_V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('I_V vs NI_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('I_V vs NI_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 309
write.csv(sig_dif, "Significant genes_I_V vs NI_V.csv")
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Scatter Plot"))
pdf('ScatterPlot_I_V vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(ma <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot"))
pdf('MAPlot_I_V vs NI_V.pdf')
ma
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot"))
## Warning: Removed 26 rows containing missing values (geom_point).
pdf('VolcanoPlot_I_V vs NI_V.pdf')
volcano
## Warning: Removed 26 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Classic GO analysis is performed here for all DE genes detected in this dataset. The reference list is list of genes detected in RNASeq. Three categories of GO terms are tested here, including biological process, molecular function and cellular component.
target_gene <- as.character(rownames(sig_dif))
detected_gene <- as.character(rownames(detected_pseudocount))
# Run GO enrichment analysis for biological process
ego_BP <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "BP",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_BP <- data.frame(ego_BP)
write.csv(cluster_summary_BP, "GO analysis/GO analysis_BP_I_V vs NI_V.csv")
# Run GO enrichment analysis for molecular function
ego_MF <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "MF",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_MF <- data.frame(ego_MF)
write.csv(cluster_summary_MF, "GO analysis/GO analysis_MF_I_V vs NI_V.csv")
# Run GO enrichment analysis for cellular component
ego_CC <- enrichGO(gene = target_gene,
universe = detected_gene,
keyType = "ENSEMBL",
OrgDb = org.Mm.eg.db,
ont = "CC",
pAdjustMethod = "BH",
pvalueCutoff = 0.05,
readable = TRUE)
# Output results from GO analysis to a table
cluster_summary_CC <- data.frame(ego_CC)
write.csv(cluster_summary_CC, "GO analysis/GO analysis_CC_I_V vs NI_V.csv")
png('GO analysis/GO dotplot_BP_I_V vs NI_V.png',
width = 800,
height = 400,
res = 100,
pointsize = 8)
dotplot(ego_BP, showCategory=10)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO dotplot_BP_I_V vs NI_V.pdf',
width = 10,
height = 4)
dotplot(ego_BP, showCategory=10)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO dotplot_BP_I_V vs NI_V.png')
png('GO analysis/GO dotplot_MF_I_V vs NI_V.png',
width = 800,
height = 400,
res = 100,
pointsize = 8)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO dotplot_MF_I_V vs NI_V.pdf',
width = 8,
height = 4)
dotplot(ego_MF, showCategory=10)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO dotplot_MF_I_V vs NI_V.png')
png('GO analysis/GO dotplot_CC_I_V vs NI_V.png',
width = 800,
height = 400,
res = 100,
pointsize = 8)
dotplot(ego_CC, showCategory=10)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO dotplot_CC_I_V vs NI_V.pdf',
width = 8,
height = 4)
dotplot(ego_CC, showCategory=50)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO dotplot_CC_I_V vs NI_V.png')
png('GO analysis/GO enrichment_BP_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO enrichment_BP_I_V vs NI_V.pdf',
width = 16,
height = 16)
emapplot(ego_BP, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO enrichment_BP_I_V vs NI_V.png')
png('GO analysis/GO enrichment_MF_I_V vs NI_V.png',
width = 1200,
height = 1200,
res = 100,
pointsize = 8)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO enrichment_MF_I_V vs NI_V.pdf',
width = 12,
height = 12)
emapplot(ego_MF, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO enrichment_MF_I_V vs NI_V.png')
png('GO analysis/GO enrichment_CC_I_V vs NI_V.png',
width = 1000,
height = 1000,
res = 100,
pointsize = 8)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO enrichment_CC_I_V vs NI_V.pdf',
width = 10,
height = 10)
emapplot(ego_CC, showCategory = 50)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO enrichment_CC_I_V vs NI_V.png')
The category netplot shows the relationships between the genes associated with the top five most significant GO terms and the fold changes of the significant genes associated with these terms (color). ##### Biological process
OE_foldchanges <- sig_dif$log2FoldChange
names(OE_foldchanges) <- rownames(sig_dif)
png('GO analysis/GO cnetplot_BP_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO cnetplot_BP_I_V vs NI_V.pdf',
width = 16,
height = 16)
cnetplot(ego_BP,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO cnetplot_BP_I_V vs NI_V.png')
png('GO analysis/GO cnetplot_MF_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO cnetplot_MF_I_V vs NI_V.pdf',
width = 16,
height = 16)
cnetplot(ego_MF,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO cnetplot_MF_I_V vs NI_V.png')
png('GO analysis/GO cnetplot_CC_I_V vs NI_V.png',
width = 1600,
height = 1600,
res = 100,
pointsize = 8)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
pdf('GO analysis/GO cnetplot_CC_I_V vs NI_V.pdf',
width = 16,
height = 16)
cnetplot(ego_CC,
categorySize="pvalue",
showCategory = 5,
foldChange=OE_foldchanges,
vertex.label.font=6)
dev.off()
## quartz_off_screen
## 2
include_graphics('GO analysis/GO cnetplot_CC_I_V vs NI_V.png')
Set working directory to the folder that contains only gene count txt files
I am filtering out I_V3.
# Generate DESeqData using the counting result generated using Salmon
countFiles_vcompare <- c(countFiles[19:25], countFiles[7:12])
countFiles_vcompare <- countFiles_vcompare[c(-3,-5,-10)]
txi.salmon <- tximport(countFiles_vcompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 7:12), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5,-10),]
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_V vs NI_V_filtered.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2) +
xlim(-50, 55) + ylim(-30, 50)
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:10] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_V vs NI_V_filtered.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_V vs NI_V_filtered.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_V vs NI_V_filtered.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:16] <- zscore(as.numeric(sig_dif[i,7:16]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:16])
png('I_V vs NI_V RNASeq_filtered.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_V vs NI_V RNASeq\npadj < 0.05\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 433
write.csv(sig_dif, "Significant genes_I_V vs NI_V_filtered.csv")
Final output is
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,6:10])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = NI_V_mean, y = I_V_mean)) +
xlab("NI_V_Average(log2)") + ylab("I_V_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Scatter Plot")
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V MA Plot")
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_V vs NI_V Volcano Plot")
## Warning: Removed 12 rows containing missing values (geom_point).
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:12], countFiles[1:6])
txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:12, 1:6), ]
DESeqsampletable$condition <- factor(c(rep('control', 6), rep('experimental', 6)))
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_MKI vs I_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_I_MKI vs I_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_I_MKI vs I_V.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))
pdf("PCA_TCT_IMKi-vs-IV.pdf",
height = 6,
width = 8)
PCA
dev.off()
## quartz_off_screen
## 2
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
I’m applying a different filtering criteria here. After discussing with Lucia, we decided to only look at genes that are expressed in 3 or more samples in each group.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:6] != 0) >=2 | sum(rawCountTable[i,7:12] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
Here I relax the padj cutoff to 0.1
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:18] <- zscore(as.numeric(sig_dif[i,7:18]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:18])
png('I_MKI vs I_V RNASeq.png',
width = 600,
height = 800,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('I_MKI vs I_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('I_MKI vs I_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 17
write.csv(sig_dif, "Significant genes_I_MKI vs I_V.csv")
# rearrange the TCT master count matrix
rawCountTable_transform_detected_TCT <- read_csv("VST_TCT.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TCT$X1
rawCountTable_transform_detected_TCT$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TCT[,19:23], rawCountTable_transform_detected_TCT[,7:12], rawCountTable_transform_detected_TCT[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names
# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:23] <- zscore(as.numeric(sig_dif[i,7:23]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:23])
png('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png',
width = 600,
height = 700,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "TCT MK2i targets\nTCT I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.pdf',
width = 9,
height = 11)
heatmap.2(heatmap_matrix,
main = "TCT MK2i targets\nTCT NI_V vs I_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('TCT MK2i targets in TCT NI_V vs I_V vs I_MK2i RNASeq.png')
# rearrange the TCT master count matrix
rawCountTable_transform_detected_TNF <- read_csv("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TNF/VST_TNF.csv", )
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## X1 = col_character()
## )
## See spec(...) for full column specifications.
row_names <- rawCountTable_transform_detected_TNF$X1
rawCountTable_transform_detected_TNF$X1 <- NULL
rawCountTable_transform_detected_reshape <- cbind(rawCountTable_transform_detected_TNF[,17:21], rawCountTable_transform_detected_TNF[,7:11], rawCountTable_transform_detected_TNF[,1:6])
rownames(rawCountTable_transform_detected_reshape) <- row_names
# plot the heatmap
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.25)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected_reshape)))
}
sig_count <- rawCountTable_transform_detected_reshape[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:22] <- zscore(as.numeric(sig_dif[i,7:22]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:22])
png('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png',
width = 600,
height = 700,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "TCT MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.pdf',
width = 9,
height = 11)
heatmap.2(heatmap_matrix,
main = "TCT MK2i targets\nTNF I_V vs NI_V vs I_MK2i\npadj < 0.1 LFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
lwid = c(1,5),
col=my_palette,
cexCol = 1,
margins = c(8,3),
labRow = FALSE,
trace = "none",
dendrogram = "row",
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('TCT MK2i targets in TNF NI_V vs I_V vs I_MK2i RNASeq.png')
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,7:12])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:6])
(scatter <- ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Scatter Plot"))
pdf('ScatterPlot_I_MKI vs I_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot"))
pdf('MAPlot_I_MKI vs I_V.pdf')
MA
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.25), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.25), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot"))
## Warning: Removed 44 rows containing missing values (geom_point).
pdf('VolcanoPlot_I_MKI vs I_V.pdf')
volcano
## Warning: Removed 44 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
Set working directory to the folder that contains only gene count txt files
I am filtering out I_V3. Which is the weird one.
# Generate DESeqData using the counting result generated using Salmon
countFiles_icompare <- c(countFiles[7:12], countFiles[1:6])
countFiles_icompare <- countFiles_icompare[-3]
txi.salmon <- tximport(countFiles_icompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(7:12, 1:6), ]
DESeqsampletable <- DESeqsampletable[-3,]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_I_MKI vs I_V_filtered.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2) +
xlim(-50, 40) + ylim(-25, 30)
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
I’m applying a different filtering criteria here. After discussing with Lucia, we decided to only look at genes that are expressed in 3 or more samples in each group.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_I_MKI vs I_V_filtered.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_I_MKI vs I_V_filtered.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
Here I relax the padj cutoff to 0.1
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_I_MKI vs I_V_filtered.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:17])
png('I_MKI vs I_V RNASeq_filtered.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "I_MKI vs I_V RNASeq\npadj < 0.1\nLFC > 0.25",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "both",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 23
write.csv(sig_dif, "Significant genes_I_MKI vs I_V_filtered.csv")
Final output is
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$I_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$I_V_mean <- rowMeans(detected_pseudocount[,1:5])
ggplot(dif_analysis, aes(x = I_V_mean, y = I_MKI_mean)) +
xlab("I_V_Average(log2)") + ylab("I_MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Scatter Plot")
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V MA Plot")
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "I_MKI vs I_V Volcano Plot")
## Warning: Removed 38 rows containing missing values (geom_point).
Set working directory to the folder that contains only gene count txt files
# Generate DESeqData using the counting result generated using Salmon
countFiles_nicompare <- c(countFiles[19:25], countFiles[13:18])
countFiles_nicompare <- countFiles_nicompare[c(-3,-5)]
txi.salmon <- tximport(countFiles_nicompare, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- DESeqsampletable_all[c(19:25, 13:18), ]
DESeqsampletable <- DESeqsampletable[c(-3,-5),]
DESeqsampletable$condition <- factor(c(rep('control', 5), rep('experimental', 6)))
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ batch + condition + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_NI_MKI vs NI_V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
pdf('Hierchical_Clustering_NI_MKI vs NI_V.pdf')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
include_graphics('Hierchical_Clustering_NI_MKI vs NI_V.png')
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
(PCA <- plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500))
pdf('PCA_TCT_NIMKi-vs-NIV.pdf',
height = 6,
width = 8)
PCA
dev.off()
## quartz_off_screen
## 2
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_NI_MKI vs NI_V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_NI_MKI vs NI_V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
Here I relaxed the padj cutoff to 0.1 (actually this didn’t change the number of DE genes, it’s the same as padj as 0.05)
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_NI_MKI vs NI_V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:17] <- zscore(as.numeric(sig_dif[i,7:17]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:17])
png('NI_MKI vs NI_V RNASeq.png',
width = 600,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.1",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
pdf('NI_MKI vs NI_V RNASeq.pdf',
width = 6,
height = 10)
heatmap.2(heatmap_matrix,
main = "NI_MKI vs NI_V RNASeq\npadj < 0.1\nLFC > 0.1",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('NI_MKI vs NI_V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 9
write.csv(sig_dif, "Significant genes_NI_MKI vs NI_V.csv")
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$NI_MKI_mean <- rowMeans(detected_pseudocount[,6:11])
dif_analysis$NI_V_mean <- rowMeans(detected_pseudocount[,1:5])
(scatter <- ggplot(dif_analysis, aes(x = NI_V_mean, y = NI_MKI_mean)) +
xlab("NI_V_Average(log2)") + ylab("NI_MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Scatter Plot"))
pdf('ScatterPlot_NI_MKI vs NI_V.pdf')
scatter
dev.off()
## quartz_off_screen
## 2
# MA plot
(MA <- ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V MA Plot"))
pdf('MAPlot_NI_MKI vs NI_V.pdf')
MA
dev.off()
## quartz_off_screen
## 2
# Volcano Plot
(volcano <- ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < -0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "NI_MKI vs NI_V Volcano Plot"))
## Warning: Removed 54 rows containing missing values (geom_point).
pdf('VolcanoPlot_NI_MKI vs NI_V.pdf')
volcano
## Warning: Removed 54 rows containing missing values (geom_point).
dev.off()
## quartz_off_screen
## 2
# Generate DESeqData using the counting result generated using Salmon
countFiles_all <- c(countFiles[19:25], countFiles[7:12], countFiles[13:18], countFiles[1:6])
countFiles_all <- countFiles_all[-c(3,5)]
txi.salmon <- tximport(countFiles_all, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## transcripts missing from tx2gene: 30916
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable_all$condition <- c(rep("experimental", 6), rep("control", 6), rep("experimental",6), rep("control", 7))
DESeqsampletable <- rbind(DESeqsampletable_all[19:25,], DESeqsampletable_all[7:12,], DESeqsampletable_all[13:18,], DESeqsampletable_all[1:6,])
DESeqsampletable <- DESeqsampletable[-c(3,5),]
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~ gender + batch + condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- DESeq(ddsHTSeq)
## estimating size factors
## using 'avgTxLength' from assays(dds), correcting for library size
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis, type = "normal")
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
##
## Note that type='apeglm' and type='ashr' have shown to have less bias than type='normal'.
## See ?lfcShrink for more details on shrinkage type, and the DESeq2 vignette.
## Reference: https://doi.org/10.1093/bioinformatics/bty895
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
This is the sanity check step to confirm that under a un-supervised clustering. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
rawCountTable <- as.data.frame(DESeq2::counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Colloboration Projects/IBD RNA-Seq analysis/Analysis/TCT")
png('Hierchical_Clustering_MKI vs V.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500)
#geom_text(aes(label=name), vjust = 2)
#xlim(-45, 60) + ylim(-30, 80)
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- c()
for (i in 1:dim(rawCountTable)[1]) {
if (sum(rawCountTable[i,1:5] != 0) >=2 | sum(rawCountTable[i,6:11] != 0) >= 2 | sum(rawCountTable[i,12:17] != 0) >= 2 | sum(rawCountTable[i,18:23] != 0) >= 2) {
keep <- c(keep, i)
}
}
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,4))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_gene_counts_MKI vs V.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_MKI vs V.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
write.csv(rawCountTable_transform_detected, "VST_MKI vs V.csv")
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.1 & abs(dif_analysis$log2FoldChange) > 0.1)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:29] <- zscore(as.numeric(sig_dif[i,7:29]))
}
heatmap_matrix <- as.matrix(sig_dif[,7:29])
png('MKI vs V RNASeq.png',
width = 800,
height = 1200,
res = 200,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "MKI vs V RNASeq\npadj < 0.1\nLFC > 0.1",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "row",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
include_graphics('MKI vs V RNASeq.png')
# output number of significant DE genes
dim(sig_dif)[1]
## [1] 9
write.csv(sig_dif, "Significant genes_MKI vs V.csv")
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$MKI_mean <- rowMeans(detected_pseudocount[,12:23])
dif_analysis$V_mean <- rowMeans(detected_pseudocount[,1:11])
ggplot(dif_analysis, aes(x = V_mean, y = MKI_mean)) +
xlab("V_Average(log2)") + ylab("MKI_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Scatter Plot")
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V MA Plot")
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(pvalue,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange > 0.1), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.1 & log2FoldChange < 0.1), alpha = 0.5, size = 1, color = "blue") +
labs(title = "MKI vs V Volcano Plot")
## Warning: Removed 71 rows containing missing values (geom_point).
sessionInfo()
## R version 4.0.0 (2020-04-24)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Catalina 10.15.4
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRblas.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.0/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] grid parallel stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] mosaic_1.6.0 Matrix_1.2-18
## [3] mosaicData_0.17.0 ggformula_0.9.4
## [5] ggstance_0.3.4 knitr_1.28
## [7] pathview_1.28.0 org.Hs.eg.db_3.11.1
## [9] org.Mm.eg.db_3.11.1 DOSE_3.14.0
## [11] clusterProfiler_3.16.0 VennDiagram_1.6.20
## [13] futile.logger_1.4.3 dplyr_0.8.5
## [15] readr_1.3.1 RColorBrewer_1.1-2
## [17] gplots_3.0.3 mixOmics_6.12.0
## [19] MASS_7.3-51.6 reshape_0.8.8
## [21] lattice_0.20-41 ggplot2_3.3.0
## [23] EnsDb.Mmusculus.v79_2.99.0 ensembldb_2.12.1
## [25] AnnotationFilter_1.12.0 GenomicFeatures_1.40.0
## [27] AnnotationDbi_1.50.0 gridExtra_2.3
## [29] tximport_1.16.0 limma_3.44.1
## [31] DESeq2_1.28.0 SummarizedExperiment_1.18.1
## [33] DelayedArray_0.14.0 matrixStats_0.56.0
## [35] Biobase_2.48.0 GenomicRanges_1.40.0
## [37] GenomeInfoDb_1.24.0 IRanges_2.22.1
## [39] S4Vectors_0.26.0 BiocGenerics_0.34.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.0.0 htmlwidgets_1.5.1 RSQLite_2.2.0
## [4] BiocParallel_1.22.0 scatterpie_0.1.4 munsell_0.5.0
## [7] withr_2.2.0 colorspace_1.4-1 GOSemSim_2.14.0
## [10] labeling_0.3 KEGGgraph_1.48.0 urltools_1.7.3
## [13] GenomeInfoDbData_1.2.3 polyclip_1.10-0 bit64_0.9-7
## [16] farver_2.0.3 downloader_0.4 vctrs_0.2.4
## [19] generics_0.0.2 lambda.r_1.2.4 xfun_0.13
## [22] BiocFileCache_1.12.0 R6_2.4.1 graphlayouts_0.7.0
## [25] locfit_1.5-9.4 bitops_1.0-6 fgsea_1.14.0
## [28] gridGraphics_0.5-0 assertthat_0.2.1 scales_1.1.0
## [31] ggraph_2.0.2 enrichplot_1.8.1 gtable_0.3.0
## [34] tidygraph_1.1.2 rlang_0.4.6 genefilter_1.70.0
## [37] splines_4.0.0 rtracklayer_1.48.0 lazyeval_0.2.2
## [40] broom_0.5.6 europepmc_0.3 mosaicCore_0.6.0
## [43] BiocManager_1.30.10 yaml_2.2.1 reshape2_1.4.4
## [46] crosstalk_1.1.0.1 backports_1.1.6 qvalue_2.20.0
## [49] tools_4.0.0 ggplotify_0.0.5 ellipsis_0.3.0
## [52] ggdendro_0.1-20 ggridges_0.5.2 Rcpp_1.0.4.6
## [55] plyr_1.8.6 progress_1.2.2 zlibbioc_1.34.0
## [58] purrr_0.3.4 RCurl_1.98-1.2 prettyunits_1.1.1
## [61] openssl_1.4.1 viridis_0.5.1 cowplot_1.0.0
## [64] ggrepel_0.8.2 magrittr_1.5 data.table_1.12.8
## [67] RSpectra_0.16-0 futile.options_1.0.1 DO.db_2.9
## [70] triebeard_0.3.0 ProtGenerics_1.20.0 hms_0.5.3
## [73] evaluate_0.14 xtable_1.8-4 XML_3.99-0.3
## [76] leaflet_2.0.3 compiler_4.0.0 biomaRt_2.44.0
## [79] ellipse_0.4.1 tibble_3.0.1 KernSmooth_2.23-17
## [82] crayon_1.3.4 htmltools_0.4.0 corpcor_1.6.9
## [85] tidyr_1.0.3 geneplotter_1.66.0 DBI_1.1.0
## [88] tweenr_1.0.1 formatR_1.7 dbplyr_1.4.3
## [91] rappdirs_0.3.1 gdata_2.18.0 igraph_1.2.5
## [94] pkgconfig_2.0.3 rvcheck_0.1.8 GenomicAlignments_1.24.0
## [97] xml2_1.3.2 rARPACK_0.11-0 annotate_1.66.0
## [100] XVector_0.28.0 stringr_1.4.0 digest_0.6.25
## [103] graph_1.66.0 Biostrings_2.56.0 rmarkdown_2.1
## [106] fastmatch_1.1-0 curl_4.3 Rsamtools_2.4.0
## [109] gtools_3.8.2 lifecycle_0.2.0 nlme_3.1-147
## [112] jsonlite_1.6.1 viridisLite_0.3.0 askpass_1.1
## [115] pillar_1.4.4 KEGGREST_1.28.0 httr_1.4.1
## [118] survival_3.1-12 GO.db_3.11.1 glue_1.4.0
## [121] png_0.1-7 bit_1.1-15.2 Rgraphviz_2.32.0
## [124] ggforce_0.3.1 stringi_1.4.6 blob_1.2.1
## [127] caTools_1.18.0 memoise_1.1.0